## Uncomment and run this cell to install pandas
#!pip install pandas
#!pip install openpyxl
import pandas as pd
from dataidea.datasets import loadDataset
# to check python version
pd.__version__
Creating Dataframes
creating dataframes from existing files
# loading an excel file into a dataframe
data = loadDataset('demo' )
The data structure that is returned by the statement is called a DataFrame
# checking the datatype of the data object
type (data)
pandas.core.frame.DataFrame
# randomly sample some values
data.sample(n= 5 )
171
24
m
0
5
13
1
1
92
61
m
1
18
23
1
3
72
36
m
0
8
51
3
1
3
24
m
1
4
26
2
1
21
36
f
0
6
39
2
1
Creating a DataFrame from a Dictionary
We can create pandas dataframes using two major ways: - Using a dictionary - Using a 2D list
# create a pandas dataframe using a dictionary
data_dictionary = {
'age' : [65 , 51 , 45 , 38 , 40 ],
'gender' : ['m' , 'm' , 'm' , 'f' , 'm' ],
'income' : [42 , 148 , 147 , 43 , 89 ]
}
dataframe_from_dict = pd.DataFrame(data= data_dictionary)
# display the dataframe
dataframe_from_dict
0
65
m
42
1
51
m
148
2
45
m
147
3
38
f
43
4
40
m
89
# creating a dataframe from a 2D list
data_list = [
[28 , 'm' , 24 ],
[59 , 'm' , 841 ],
[54 , 'm' , 741 ],
[83 , 'f' , 34 ],
[34 , 'm' , 98 ]
]
dataframe_from_list = pd.DataFrame(data= data_list,
columns= ['age' , 'gender' , 'income' ])
# display the dataframe
dataframe_from_list
0
28
m
24
1
59
m
841
2
54
m
741
3
83
f
34
4
34
m
98
# Finding more information
# help(pd.DataFrame)
## Another way to find more information
# ?pd.DataFrame
Concatenating DataFrames
Sometimes there’s a need to add two or more dataframes. To perform this, for the start, we can use the pd.concat()
. Below is some illustration
concatenated_dataframe = pd.concat([dataframe_from_dict, dataframe_from_list], ignore_index= True )
0
65
m
42
1
51
m
148
2
45
m
147
3
38
f
43
4
40
m
89
5
28
m
24
6
59
m
841
7
54
m
741
8
83
f
34
9
34
m
98
Sampling values in the DataFrame
# We can have look at the top part
concatenated_dataframe.head(n= 3 )
0
65
m
42
1
51
m
148
2
45
m
147
# We can look at the bottom part
concatenated_dataframe.tail(n= 3 )
7
54
m
741
8
83
f
34
9
34
m
98
# We can also randomly sample out some values in a DataFrame
concatenated_dataframe.sample(n= 3 )
5
28
m
24
4
40
m
89
0
65
m
42
Selection
Selecting, Boolean Indexing and Setting
country_data = pd.DataFrame(data= {
'Country' : ['Belgium' , 'India' , 'Brazil' ],
'Capital' : ['Brussels' , 'New Delhi' , 'Brasilia' ],
'Population' : [11190846 , 1303171035 , 207847528 ]
})
country_data
0
Belgium
Brussels
11190846
1
India
New Delhi
1303171035
2
Brazil
Brasilia
207847528
# position 1
print (country_data.iloc[0 , 0 ])
print (country_data.iloc[2 , 1 ])
# position 2
print (country_data.iat[0 , 0 ])
print (country_data.iat[2 , 1 ])
Ponder:
How can you use the pd.DataFrame.iat
method to replace (or modify) a specific value in a dataframe
def locate(dataframe:pd.core.frame.DataFrame= None , row:int = 0 , column:str = None ):
'''
Selects specific item by row index and column name
'''
return dataframe.loc[row, column]
locate(country_data, 0 , 'Capital' )
Signature:
locate(
dataframe: pandas. core. frame. DataFrame = None ,
row: int = 0 ,
column: str = None ,
)
Docstring: Selects specific item by row index and column name
File: /tmp/ipykernel_4522/2733881718.py
Type: function
# using label
print (country_data.loc[0 , 'Capital' ])
print (country_data.loc[1 , 'Population' ])
# using label
print (country_data.at[2 , 'Population' ])
print (country_data.at[1 , 'Capital' ])
# picking out data from a specific column
country_data.Country
0 Belgium
1 India
2 Brazil
Name: Country, dtype: object
# another way to pick data from a specific column
country_data['Capital' ]
0 Brussels
1 New Delhi
2 Brasilia
Name: Capital, dtype: object
The data structure that is returned by the statement is called a Series
# lets check it
type (country_data['Capital' ])
pandas.core.series.Series
# Get specific row data (using index)
country_data.iloc[0 ]
Country Belgium
Capital Brussels
Population 11190846
Name: 0, dtype: object
# get all rows that have a column-value matching a specific value
# eg where country is Belgium
country_data[country_data['Country' ] == 'Belgium' ]
0
Belgium
Brussels
11190846
# Think about this
country_data['Country' ] == 'Belgium'
0 True
1 False
2 False
Name: Country, dtype: bool
Dropping
0
Belgium
Brussels
11190846
1
India
New Delhi
1303171035
2
Brazil
Brasilia
207847528
# drop a column from a dataframe
country_data.drop('Country' , axis= 1 )
0
Brussels
11190846
1
New Delhi
1303171035
2
Brasilia
207847528
def calculateBMI(weight_kg, height_m, status:bool = True , round_off:bool = True ):
bmi = weight_kg / height_m ** 2
if round_off:
bmi = round (bmi)
if status:
status = 'Normal' if 18 <= bmi <= 24 else 'Not Normal'
return bmi, status
else :
return bmi
# You can drop many columns by passing in a columns list
country_data.drop(columns= ['Country' , 'Population' ], inplace= True )
0
Belgium
Brussels
11190846
1
India
New Delhi
1303171035
2
Brazil
Brasilia
207847528
# how to drop row data
country_data.drop([0 , 2 ], axis= 0 )
1
India
New Delhi
1303171035
Back to top